{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download Tweets with the Twitter Academic API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the relevant libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import _locale\n",
    "_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf_8_sig'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "from glob import glob\n",
    "import json\n",
    "from ast import literal_eval\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import time\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "import threading\n",
    "import concurrent.futures\n",
    "import config_ac_twitter # file storing the API credentials"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Function to retrieve tweets and parse them into a pandas dataframe. The function takes a search query, the maximal results it should return per request, and the time of the earliest and most recent tweet that should be returned as input. It returns a dataframe with tweets as rows and several characteristics of the tweet as columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tweets(query, max_results, start_time, end_time):\n",
    "    \n",
    "    tweets_out = [] #create a list\n",
    "    l = 0\n",
    "\n",
    "    # first, the function needs to handle pagination. Twitters API returns 500 results maximum per request. \n",
    "    try:\n",
    "        while len(tweets_out) < 1000000:\n",
    "            \n",
    "            if l == 0: #pageination\n",
    "                query_params = {'query': query,\n",
    "                                'max_results' : max_results,\n",
    "                                'start_time' : start_time,\n",
    "                                'end_time' : end_time,\n",
    "                                'tweet.fields': 'author_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,referenced_tweets,text,attachments',\n",
    "                                'expansions': 'author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',\n",
    "                                'user.fields': 'id,username,name'\n",
    "                                }\n",
    "            else:\n",
    "                query_params = {'query': query,\n",
    "                                'max_results' : max_results,\n",
    "                                'start_time' : start_time,\n",
    "                                'end_time' : end_time,\n",
    "                                'tweet.fields': 'author_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,referenced_tweets,text,attachments',\n",
    "                                'expansions': 'author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',\n",
    "                                'user.fields': 'id,username,name',\n",
    "                                'next_token': tweets['meta']['next_token']\n",
    "                                }\n",
    "\n",
    "            # Copy pasted from andy piper's github: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py\n",
    "\n",
    "            def create_headers(bearer_token):\n",
    "                headers = {\"Authorization\": \"Bearer {}\".format(bearer_token)}\n",
    "                return headers\n",
    "\n",
    "            def connect_to_endpoint(url, headers, params):\n",
    "                response = requests.request(\"GET\", search_url, headers=headers, params=params)\n",
    "                if response.status_code != 200:\n",
    "                    print(response.status_code)\n",
    "                    raise Exception(response.status_code, response.text)\n",
    "                return response.json()\n",
    "\n",
    "            def main():\n",
    "                headers = create_headers(bearer_token)\n",
    "                json_response = connect_to_endpoint(search_url, headers, query_params)\n",
    "                return json_response #return as output\n",
    "\n",
    "            #Execute function to retrieve tweets\n",
    "            tweets = main()\n",
    "\n",
    "            #Parsing\n",
    "\n",
    "            tweets_clean = []\n",
    "\n",
    "            for i in range(0, tweets['meta']['result_count']):\n",
    "                tdict = {}\n",
    "\n",
    "                tdict['tweet_id'] = tweets['data'][i]['id']\n",
    "                tdict['datetime'] = tweets['data'][i]['created_at']\n",
    "                tdict['language'] = tweets['data'][i]['lang']\n",
    "                tdict['text'] = tweets['data'][i]['text'].encode('utf-8', 'replace').decode('utf-8') # some emojis may cause an error; this is avoided by replacing them with question marks\n",
    "\n",
    "                tdict['referenced_tweet_id'] = []\n",
    "                tdict['referenced_types'] = []\n",
    "                tdict['referenced_author_id'] = []\n",
    "                tdict['referenced_likes'] = []\n",
    "                tdict['referenced_retweets'] = []\n",
    "                tdict['referenced_quotes'] = []\n",
    "                tdict['referenced_replies'] = []\n",
    "                tdict['quoted_username'] = []\n",
    "\n",
    "                try:\n",
    "                    for j in range(len(tweets['data'][i]['referenced_tweets'])):\n",
    "                        tdict['referenced_tweet_id'].append(tweets['data'][i]['referenced_tweets'][j]['id'])\n",
    "                        tdict['referenced_types'].append(tweets['data'][i]['referenced_tweets'][j]['type'])\n",
    "                        \n",
    "                        for k in range(len(tweets['includes']['tweets'])):\n",
    "                            if tweets['data'][i]['referenced_tweets'][j]['id'] == tweets['includes']['tweets'][k]['id']:\n",
    "                                tdict['referenced_author_id'] = tweets['includes']['tweets'][k]['author_id']\n",
    "                                tdict['referenced_datetime'] = tweets['includes']['tweets'][k]['created_at']\n",
    "                                tdict['referenced_likes'] = tweets['includes']['tweets'][k]['public_metrics']['like_count']\n",
    "                                tdict['referenced_retweets'] = tweets['includes']['tweets'][k]['public_metrics']['retweet_count']\n",
    "                                tdict['referenced_quotes'] = tweets['includes']['tweets'][k]['public_metrics']['quote_count']\n",
    "                                tdict['referenced_replies'] = tweets['includes']['tweets'][k]['public_metrics']['reply_count']\n",
    "                            else:\n",
    "                                pass\n",
    "                except:\n",
    "                    pass\n",
    "                \n",
    "                try:\n",
    "                    tdict['retweeted_username'] = (re.findall(r'RT *@(.*?):', tweets['data'][i]['text']))\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                try:\n",
    "                    tdict['in_reply_to_user_id'] = tweets['data'][i]['in_reply_to_user_id']\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                tdict['hashtags'] = []\n",
    "                try:\n",
    "                    for j in range(len(tweets['data'][i]['entities']['hashtags'])):\n",
    "                        tdict['hashtags'].append(tweets['data'][i]['entities']['hashtags'][j]['tag'])\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                tdict['mentions'] = []\n",
    "                try:\n",
    "                    for j in range(len(tweets['data'][i]['entities']['mentions'])):\n",
    "                        tdict['mentions'].append(tweets['data'][i]['entities']['mentions'][j]['username'])\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                tdict['urls'] = []\n",
    "                try:\n",
    "                    for j in range(len(tweets['data'][i]['entities']['urls'])):\n",
    "                        tdict['urls'].append(tweets['data'][i]['entities']['urls'][j]['expanded_url'])\n",
    "                except:\n",
    "                    pass\n",
    "                \n",
    "                #Extract the username of the author of the quoted tweet\n",
    "                try:\n",
    "                    for n in range(len(tdict['referenced_types'])): #loop over referenced types...\n",
    "                        if tdict['referenced_types'][n] == \"quoted\": #...and if the tweet qoutes another...\n",
    "                            regex = 'twitter\\.com\\/(.*?)\\/status\\/{}'.format(tdict['referenced_tweet_id'][n]) #...build a regular expression that matches the username of the author by matching the link to the referenced tweet\n",
    "                            for o in range(len(tdict['urls'])): #apply the regex by looping over all urls in the tweet and store all matches under the quoted_username key of the dictionary\n",
    "                                tdict['quoted_username'].extend(re.findall(regex, tdict[\"urls\"][o]))\n",
    "                        else:\n",
    "                            pass\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                tdict['ann_entity'] = []\n",
    "                tdict['ann_type'] = []\n",
    "                tdict['ann_prob'] = []\n",
    "                try:\n",
    "                    for j in range(len(tweets['data'][i]['entities']['annotations'])):\n",
    "                        tdict['ann_entity'].append(tweets['data'][i]['entities']['annotations'][j]['normalized_text'])\n",
    "                        tdict['ann_type'].append(tweets['data'][i]['entities']['annotations'][j]['type'])\n",
    "                        tdict['ann_prob'].append(tweets['data'][i]['entities']['annotations'][j]['probability'])\n",
    "                except:\n",
    "                    pass\n",
    "\n",
    "                tdict['geo_coordinates'] = []\n",
    "                tdict['geo_place_id'] = []\n",
    "                try:\n",
    "                    tdict['geo_coordinates'] = tweets['data'][i]['geo']['coordinates']['coordinates']\n",
    "                    tdict['geo_place_id'] = tweets['data'][i]['geo']['place_id']\n",
    "                except:\n",
    "                    pass\n",
    "                \n",
    "                tdict['likes'] = tweets['data'][i]['public_metrics']['like_count']\n",
    "                tdict['retweets'] = tweets['data'][i]['public_metrics']['retweet_count']\n",
    "                tdict['quotes'] = tweets['data'][i]['public_metrics']['quote_count']\n",
    "                tdict['replies'] = tweets['data'][i]['public_metrics']['reply_count']\n",
    "                tdict['author_id'] = tweets['data'][i]['author_id']\n",
    "                \n",
    "                for m in range(len(tweets['includes']['users'])):\n",
    "                    if tweets['data'][i]['author_id'] == tweets['includes']['users'][m]['id']:\n",
    "                        tdict['author_username'] = tweets['includes']['users'][m]['username']\n",
    "                    else:\n",
    "                        pass\n",
    "                \n",
    "                try:\n",
    "                    tdict['link_to_tweet'] = 'https://twitter.com/{}/status/{}'.format(tdict['author_username'], tweets['data'][i]['id'])\n",
    "                except:\n",
    "                    pass\n",
    "                \n",
    "                tdict['request_page'] = l + 1\n",
    "\n",
    "                tweets_clean.append(tdict)\n",
    "\n",
    "            time.sleep(1) #avoid rate limiting\n",
    "            \n",
    "            tweets_out.extend(tweets_clean)\n",
    "            \n",
    "            if (l+2)/300 in [*range(0,10000)]:\n",
    "                print('15min break')\n",
    "                time.sleep(901)\n",
    "            else:\n",
    "                pass\n",
    "        \n",
    "            # To track the progress every 100th page number will be printed\n",
    "\n",
    "            if (l+1)/100 in [*range(0,10000)]:\n",
    "                print(l+1)\n",
    "            else:\n",
    "                pass\n",
    "            \n",
    "            l+=1\n",
    "            \n",
    "        try:\n",
    "            print(\"Metadata of the last page:\", tweets['meta'])\n",
    "        except:\n",
    "            pass\n",
    "        \n",
    "        final_output = pd.DataFrame(tweets_out)\n",
    "        \n",
    "    except KeyError:\n",
    "        print(\"Finished\")\n",
    "        final_output = pd.DataFrame(tweets_out)\n",
    "    \n",
    "    return(final_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Tweets\n",
    "### SOPA/PIPA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "bearer_token = config_ac_twitter.BEARER_TOKEN\n",
    "search_url = 'https://api.twitter.com/2/tweets/search/all'\n",
    "\n",
    "query = \"#sopa OR (stop online piracy act) OR #pipa OR #protectip OR (protect IP act) OR (protect intellectual property act) OR #coica OR (combatting online infringement and copyright act)\" \n",
    "start_time = \"2010-01-01T00:00:00.000Z\"\n",
    "end_time = \"2011-01-01T00:00:00.000Z\"\n",
    "max_results = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "400\n"
     ]
    },
    {
     "ename": "Exception",
     "evalue": "(400, '{\"errors\":[{\"parameters\":{\"query\":[\"#sopa OR (stop online piracy act) OR #pipa OR #protectip OR (protect IP act) OR (protect intellectual property act) OR #coica OR (combatting online infringement and copyright act)\"]},\"message\":\"There were errors processing your request: Ambiguous use of and as a keyword. Use a space to logically join two clauses, or \\\\\"and\\\\\" to find occurrences of and in text (at position 162)\"}],\"title\":\"Invalid Request\",\"detail\":\"One or more parameters to your request was invalid.\",\"type\":\"https://api.twitter.com/2/problems/invalid-request\"}')",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mException\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-7-0e38073bcc9b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msopa_pipa_2010\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mquery\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mend_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mend_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_results\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmax_results\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[1;31m#sopa_pipa_2010.to_csv(\"US_Tweets_SOPA_PIPA_2010.csv\")\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-3-bbd1ebbcddce>\u001b[0m in \u001b[0;36mget_tweets\u001b[1;34m(query, max_results, start_time, end_time)\u001b[0m\n\u001b[0;32m     47\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     48\u001b[0m             \u001b[1;31m#Execute function to retrieve tweets\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m             \u001b[0mtweets\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     51\u001b[0m             \u001b[1;31m#Parsing\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-3-bbd1ebbcddce>\u001b[0m in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m     43\u001b[0m             \u001b[1;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     44\u001b[0m                 \u001b[0mheaders\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcreate_headers\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbearer_token\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 45\u001b[1;33m                 \u001b[0mjson_response\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnect_to_endpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msearch_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mquery_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     46\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mjson_response\u001b[0m \u001b[1;31m#return as output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     47\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-3-bbd1ebbcddce>\u001b[0m in \u001b[0;36mconnect_to_endpoint\u001b[1;34m(url, headers, params)\u001b[0m\n\u001b[0;32m     38\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;36m200\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m                     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus_code\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 40\u001b[1;33m                     \u001b[1;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus_code\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     41\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     42\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mException\u001b[0m: (400, '{\"errors\":[{\"parameters\":{\"query\":[\"#sopa OR (stop online piracy act) OR #pipa OR #protectip OR (protect IP act) OR (protect intellectual property act) OR #coica OR (combatting online infringement and copyright act)\"]},\"message\":\"There were errors processing your request: Ambiguous use of and as a keyword. Use a space to logically join two clauses, or \\\\\"and\\\\\" to find occurrences of and in text (at position 162)\"}],\"title\":\"Invalid Request\",\"detail\":\"One or more parameters to your request was invalid.\",\"type\":\"https://api.twitter.com/2/problems/invalid-request\"}')"
     ]
    }
   ],
   "source": [
    "sopa_pipa_2010 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "#sopa_pipa_2010.to_csv(\"US_Tweets_SOPA_PIPA_2010.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "start_time = \"2011-01-01T00:00:00.000Z\"\n",
    "end_time = \"2012-01-01T00:00:00.000Z\"\n",
    "sopa_pipa_2011 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "sopa_pipa_2011.to_csv(\"US_Tweets_SOPA_PIPA_2011.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "start_time = \"2012-01-01T00:00:00.000Z\"\n",
    "end_time = \"2013-01-01T00:00:00.000Z\"\n",
    "sopa_pipa_2012 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "sopa_pipa_2012.to_csv(\"US_Tweets_SOPA_PIPA_2012.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = \"2012-01-01T00:00:00.000Z\"\n",
    "end_time = \"2012-01-18T20:13:37.000Z\"\n",
    "sopa_pipa_2012a = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "sopa_pipa_2012a.to_csv(\"US_Tweets_SOPA_PIPA_2012a.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merge the separate files and export to csv."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_2010 = pd.read_csv(\"US_Tweets_SOPA_PIPA_2010.csv\")\n",
    "sopa_2011 = pd.read_csv(\"US_Tweets_SOPA_PIPA_2011.csv\")\n",
    "sopa_2012 = pd.read_csv(\"US_Tweets_SOPA_PIPA_2012.csv\")\n",
    "sopa_2012a = pd.read_csv(\"US_Tweets_SOPA_PIPA_2012a.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_sopa_df = [sopa_2010, sopa_2011, sopa_2012, sopa_2012a]\n",
    "sopa_df = pd.concat(list_sopa_df)\n",
    "sopa_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicate tweets and keep only the first \n",
    "sopa_df.drop_duplicates(subset =\"tweet_id\", keep = \"first\", inplace = True)\n",
    "len(sopa_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df = sopa_df.drop(columns = [\"Unnamed: 0\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df.to_csv(\"US_SOPA_Tweets_2010-2012.csv\", index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EU Copyright"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Hashtags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"#publishersright OR #Europeforcreators OR #Europeforcreativity OR #EUforCreators OR #yes2copyright OR #yes2creativity OR #manifesto4copyright OR #NoCreatorsNoContent OR #CreateyourInternet OR #WeNeedCopyrightDirective OR #creatorsrights OR #saveoursound OR #article13 OR #copyrightdirective OR #article11 OR #artikel13 OR #urheberrechtsreform OR #urheberrechtsrichtline OR #ACTA2 OR #StopActa2 OR #uploadfilter OR #uploadfilters OR #saveyourinternet OR #linktax OR #fixcopyright OR #censorshipmachine OR #savethelink\" \n",
    "start_time = \"2013-01-01T00:00:00.000Z\"\n",
    "end_time = \"2018-01-01T00:00:00.000Z\"\n",
    "max_results = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_copyright_hash_2013_17 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_hash_2013_17.to_csv(\"EU_Copyright_hash_2013-2017.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "start_time = \"2018-01-01T00:00:00.000Z\"\n",
    "end_time = \"2019-01-01T00:00:00.000Z\"\n",
    "EU_copyright_hash_2018 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_hash_2018.to_csv(\"EU_Copyright_hash_2018.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "start_time = \"2019-01-01T00:00:00.000Z\"\n",
    "end_time = \"2020-01-01T00:00:00.000Z\"\n",
    "EU_copyright_hash_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_hash_2019.to_csv(\"EU_Copyright_hash_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### English"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective) ((EU OR Commission OR Brussels OR Europe OR European OR euro OR Reda OR Oettinger OR Voss) OR ((Single Market) OR (Internal Market))) (directive OR reform OR proposal OR debate OR law OR bill OR initiative)\"\n",
    "start_time = \"2013-01-01T00:00:00.000Z\"\n",
    "end_time = \"2019-01-01T00:00:00.000Z\"\n",
    "max_results = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_copyright_en1_2013_18 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_en1_2013_18.to_csv(\"EU_Copyright_en1_2013-2018.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(330)\n",
    "print(\"start\")\n",
    "start_time = \"2019-01-01T00:00:00.000Z\"\n",
    "end_time = \"2020-01-01T00:00:00.000Z\"\n",
    "EU_copyright_en1_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_en1_2019.to_csv(\"EU_Copyright_en1_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective) ((#article13 OR #article11 OR #article17 OR #article15) OR ((article 11) OR (article 13) OR (article 17) OR (article 15)))\"\n",
    "start_time = \"2013-01-01T00:00:00.000Z\"\n",
    "end_time = \"2020-01-01T00:00:00.000Z\"\n",
    "max_results = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "print(\"start\")\n",
    "start_time = \"2013-01-01T00:00:00.000Z\"\n",
    "end_time = \"2020-01-01T00:00:00.000Z\"\n",
    "EU_copyright_en2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_en2_2013_2019.to_csv(\"EU_Copyright_en2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### German"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR Urheberrecht OR Urheberrechte) (EU OR Kommission OR Brüssel OR Europa OR Europäisch OR euro OR Reda OR Oettinger OR Voss OR Binnenmarkt) (richtlinie OR reform OR Gesetz OR entwurf OR Gesetzesentwurf OR Gesetzentwurf OR Debatte OR Initiative)\"\n",
    "start_time = \"2013-01-01T00:00:00.000Z\"\n",
    "end_time = \"2020-01-01T00:00:00.000Z\"\n",
    "max_results = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#time.sleep(901)\n",
    "print(\"start\")\n",
    "EU_copyright_de1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_de1_2013_2019.to_csv(\"EU_Copyright_de1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Urheberrechtsreform OR Urheberrechtsrichtlinie OR Urheberrechtsgesetz) (EU OR Kommission OR Brüssel OR Europa OR Europäisch OR euro OR Reda OR Oettinger OR Voss OR Binnenmarkt)\"\n",
    "time.sleep(901)\n",
    "print(\"start\")\n",
    "EU_copyright_de2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_de2_2013_2019.to_csv(\"EU_Copyright_de2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR Urheberrecht OR Urheberrechte OR Urheberrechtsreform OR Urheberrechtsrichtlinie) ((#article13 OR #artikel13 OR #article11 OR #artikel11 OR #article17 OR #artikel17 OR #article15 OR artikel15) OR ((Artikel 11) OR (artikel 13) OR (artikel 17) OR (artikel 15)))\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#time.sleep(901)\n",
    "print(\"start\")\n",
    "EU_copyright_de3_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_de3_2013_2019.to_csv(\"EU_Copyright_de3_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### French"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (droit d'auteur) OR (droits d'auteur)) ((UE OR Commission OR Bruxelles OR Europe OR Européenne OR euro OR Reda OR Oettinger OR Voss) OR ((marché unique))) (directive OR réforme OR loi OR proposition OR débat OR (projet de législation) OR initiative)\"\n",
    "time.sleep(901)\n",
    "print(\"start\")\n",
    "EU_copyright_fr1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_fr1_2013_2019.to_csv(\"EU_Copyright_fr1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"((droit d'auteur) OR (droits d'auteur)) ((#article13 OR #article11 OR #article17 OR #article15) OR ((article 11) OR (article 13) OR (article 17) OR (article 15)))\"\n",
    "EU_copyright_fr2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_fr2_2013_2019.to_csv(\"EU_Copyright_fr2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Italian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (diritto d'autore) OR (diritti d'autore)) ((UE OR Commissione OR Bruxelles OR Europa OR Europea OR euro OR Reda OR Oettinger OR Voss) OR ((mercato unico))) (direttiva OR riforma OR legge OR proposizione OR dibattito OR (disegno di legge) OR iniziativa)\"\n",
    "print(\"start\")\n",
    "EU_copyright_it1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_it1_2013_2019.to_csv(\"EU_Copyright_it1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (diritto d'autore) OR (diritti d'autore)) ((#article13 OR #articolo13 OR #article11 OR #articolo11 OR #article17 OR #articolo17 OR #article15 OR #articolo15) OR ((articolo 11) OR (articolo 13) OR (articolo 17) OR (articolo 15)))\"\n",
    "print(\"start\")\n",
    "EU_copyright_it2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_it2_2013_2019.to_csv(\"EU_Copyright_it2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Spanish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (derecho de autor) OR (derechos de autor)) ((UE OR Comisión OR Bruselas OR Europa OR Europea OR euro OR Reda OR Oettinger OR Voss) OR ((mercado único))) (directiva OR reforma OR ley OR propuesta OR debate OR (proyecto de ley) OR iniciativa)\"\n",
    "time.sleep(901)\n",
    "print(\"start\")\n",
    "EU_copyright_es1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_es1_2013_2019.to_csv(\"EU_Copyright_es1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (derecho de autor) OR (derecho de autor)) ((#article13 OR #artículo13 OR #article11 OR #artículo11 OR #article17 OR #artículo17 OR #article15 OR #artículo15) OR ((artículo 11) OR (artículo 13) OR (artículo 17) OR (artículo 15)))\"\n",
    "print(\"start\")\n",
    "EU_copyright_es2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_es2_2013_2019.to_csv(\"EU_Copyright_es2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Polish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (prawo autorskie) OR (prawa autorskie) OR (praw autorskich) OR (prawach autorskich)) ((UE OR Komisja OR Bruksela OR Europa OR Europejska OR euro OR Reda OR Oettinger OR Voss) OR ((jednolity rynek))) (dyrektywa OR reforma OR ustawa OR propozycja OR debata OR (projekt ustawy) OR inicjatywa)\"\n",
    "print(\"start\")\n",
    "EU_copyright_pl1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_pl1_2013_2019.to_csv(\"EU_Copyright_pl1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"((prawo autorskie) OR (prawa autorskie) OR (praw autorskich) OR (prawach autorskich)) ((#article13 OR #artykuł13 OR #article11 OR #artykuł11 OR #article17 OR #artykuł17 OR #article15 OR #artykuł15) OR ((artykuł 11) OR (artykuł 13) OR (artykuł 17) OR (artykuł 15)))\"\n",
    "print(\"start\")\n",
    "EU_copyright_pl2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_pl2_2013_2019.to_csv(\"EU_Copyright_pl2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Portuguese"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time.sleep(901)\n",
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (direito de autor) OR (direitos de autor)) ((UE OR Comissão OR Bruxelas OR Europa OR Europeia OR euro OR Reda OR Oettinger OR Voss) OR ((mercado comum))) (diretiva OR reforma OR lei OR rascunho OR debate OR (projeto de lei) OR iniciativa)\"\n",
    "print(\"start\")\n",
    "EU_copyright_pl1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_pl1_2013_2019.to_csv(\"EU_Copyright_pt1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR (direito de autor) OR (direitos de autor)) ((#artigo13 OR #artigo 13 OR #artigo11 OR #artigo11 OR #artigo17 OR #artigo17 OR #artigo15 OR #artigo15) OR ((artigo 11) OR (artigo 13) OR (artigo 17) OR (artigo 15)))\"\n",
    "print(\"start\")\n",
    "EU_copyright_pl2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_pl2_2013_2019.to_csv(\"EU_Copyright_pt2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dutch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR auteursrecht OR auteursrechten) ((EU OR Commissie OR Brussel OR Europa OR Europese OR euro OR Reda OR Oettinger OR Voss) OR ((eengemaakte markt))) (richtlijn OR hervorming OR wet OR propositie OR debat OR Wetsvoorstel OR initiatief)\"\n",
    "print(\"start\")\n",
    "EU_copyright_nl1_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_nl1_2013_2019.to_csv(\"EU_Copyright_nl1_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"(Copyright OR #copyrightreform OR #copyrightdirective OR auteursrecht OR auteursrechten) ((#article13 OR #artikel13 OR #article11 OR #artikel11 OR #article17 OR #artikel17 OR #article15 OR artikel15) OR ((Artikel 11) OR (artikel 13) OR (artikel 17) OR (artikel 15)))\"\n",
    "print(\"start\")\n",
    "EU_copyright_nl2_2013_2019 = get_tweets(query = query, start_time = start_time, end_time = end_time, max_results = max_results)\n",
    "EU_copyright_nl2_2013_2019.to_csv(\"EU_Copyright_nl2_2013_2019.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merge EU Copyright data and export."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = glob(\"EU_Copyright_*.csv\") #glob returns a list of filenames that we care about\n",
    "print(len(filenames)) # length of list\n",
    "print(filenames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_EU = [] #load data into a list\n",
    "\n",
    "for file in filenames:\n",
    "    this_csv = pd.read_csv(file)\n",
    "    list_EU.append(this_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = pd.concat(list_EU)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicate tweets and keep only the first \n",
    "EU_df.drop_duplicates(subset =\"tweet_id\", keep = \"first\", inplace = True)\n",
    "len(EU_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = EU_df.drop(columns = [\"Unnamed: 0\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df.to_csv(\"EU_Tweets_2013-2019.csv\", index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some Data Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SOPA/PIPA\n",
    "Import data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df = pd.read_csv(\"US_SOPA_Tweets_2010-2012_processed_2.csv\", converters={\"retweeted_username\": literal_eval, \"quoted_username\": literal_eval, \"urls\": literal_eval})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Usernames... \n",
    "...extract usernames of those to which the tweet is a <ins>*reply*</ins>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['repliedto_username'] = 'NaN'\n",
    "sopa_df['repliedto_username'].replace('NaN', np.NaN)\n",
    "\n",
    "for i in range(len(sopa_df['text'])):\n",
    "    sopa_df['repliedto_username'][i] = []\n",
    "    sopa_df['repliedto_username'][i] = (re.findall(r'^@(.*?) +[^@]', sopa_df['text'][i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['username_reply'] = 'NaN'\n",
    "sopa_df['username_reply'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(sopa_df[\"repliedto_username\"])):\n",
    "    sopa_df['username_reply'][i] = []\n",
    "    try:\n",
    "        sopa_df['repliedto_username'][i] = sopa_df['repliedto_username'][i][2:(len(sopa_df['repliedto_username'][i])-2)]\n",
    "        if '@' in sopa_df['repliedto_username'][i]:\n",
    "            sopa_df['username_reply'][i].extend(sopa_df['repliedto_username'][i].split(\"@\"))\n",
    "        else:\n",
    "            sopa_df['username_reply'][i].append(sopa_df['repliedto_username'][i])\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(sopa_df)):\n",
    "    if len(sopa_df['username_reply'][i]) > 9:\n",
    "        print(len(sopa_df['username_reply'][i]), i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['rp_username1'] = 'NaN'\n",
    "sopa_df['rp_username1'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username2'] = 'NaN'\n",
    "sopa_df['rp_username2'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username3'] = 'NaN'\n",
    "sopa_df['rp_username3'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username4'] = 'NaN'\n",
    "sopa_df['rp_username4'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username5'] = 'NaN'\n",
    "sopa_df['rp_username5'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username6'] = 'NaN'\n",
    "sopa_df['rp_username6'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username7'] = 'NaN'\n",
    "sopa_df['rp_username7'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username8'] = 'NaN'\n",
    "sopa_df['rp_username8'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username9'] = 'NaN'\n",
    "sopa_df['rp_username9'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username10'] = 'NaN'\n",
    "sopa_df['rp_username10'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username11'] = 'NaN'\n",
    "sopa_df['rp_username11'].replace('NaN',np.NaN)\n",
    "sopa_df['rp_username12'] = 'NaN'\n",
    "sopa_df['rp_username12'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(sopa_df)):\n",
    "    for j in range(len(sopa_df['username_reply'][i])):\n",
    "        if j == 0:\n",
    "            sopa_df['rp_username1'][i] = sopa_df['username_reply'][i][0]\n",
    "        elif j == 1:\n",
    "            sopa_df['rp_username2'][i] = sopa_df['username_reply'][i][1]\n",
    "        elif j == 2:\n",
    "            sopa_df['rp_username3'][i] = sopa_df['username_reply'][i][2]\n",
    "        elif j == 3:\n",
    "            sopa_df['rp_username4'][i] = sopa_df['username_reply'][i][3]\n",
    "        elif j == 4:\n",
    "            sopa_df['rp_username5'][i] = sopa_df['username_reply'][i][4]\n",
    "        elif j == 5:\n",
    "            sopa_df['rp_username6'][i] = sopa_df['username_reply'][i][5]\n",
    "        elif j == 6:\n",
    "            sopa_df['rp_username7'][i] = sopa_df['username_reply'][i][6]\n",
    "        elif j == 7:\n",
    "            sopa_df['rp_username8'][i] = sopa_df['username_reply'][i][7]\n",
    "        elif j == 8:\n",
    "            sopa_df['rp_username9'][i] = sopa_df['username_reply'][i][8]\n",
    "        elif j == 9:\n",
    "            sopa_df['rp_username10'][i] = sopa_df['username_reply'][i][9]\n",
    "        elif j == 10:\n",
    "            sopa_df['rp_username11'][i] = sopa_df['username_reply'][i][10]\n",
    "        elif j == 11:\n",
    "            sopa_df['rp_username12'][i] = sopa_df['username_reply'][i][11]\n",
    "        else:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "...unlist the usernames of the authors of the <ins>*retweeted*</ins> tweet\n",
    "\n",
    "1. Determine the highest number of users retweeted in one tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(sopa_df)):\n",
    "    if len(sopa_df['retweeted_username'][i]) > 5:\n",
    "        print(len(sopa_df['retweeted_username'][i]), i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Extract the usernames and put in single columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['rt_username1'] = 'NaN'\n",
    "sopa_df['rt_username1'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username2'] = 'NaN'\n",
    "sopa_df['rt_username2'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username3'] = 'NaN'\n",
    "sopa_df['rt_username3'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username4'] = 'NaN'\n",
    "sopa_df['rt_username4'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username5'] = 'NaN'\n",
    "sopa_df['rt_username5'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username6'] = 'NaN'\n",
    "sopa_df['rt_username6'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username7'] = 'NaN'\n",
    "sopa_df['rt_username7'].replace('NaN',np.NaN)\n",
    "sopa_df['rt_username8'] = 'NaN'\n",
    "sopa_df['rt_username8'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(sopa_df)):\n",
    "    for j in range(len(sopa_df['retweeted_username'][i])):\n",
    "        if j == 0:\n",
    "            sopa_df['rt_username1'][i] = sopa_df['retweeted_username'][i][0]\n",
    "        elif j == 1:\n",
    "            sopa_df['rt_username2'][i] = sopa_df['retweeted_username'][i][1]\n",
    "        elif j == 2:\n",
    "            sopa_df['rt_username3'][i] = sopa_df['retweeted_username'][i][2]\n",
    "        elif j == 3:\n",
    "            sopa_df['rt_username4'][i] = sopa_df['retweeted_username'][i][3]\n",
    "        elif j == 4:\n",
    "            sopa_df['rt_username5'][i] = sopa_df['retweeted_username'][i][4]\n",
    "        elif j == 5:\n",
    "            sopa_df['rt_username6'][i] = sopa_df['retweeted_username'][i][5]\n",
    "        elif j == 6:\n",
    "            sopa_df['rt_username7'][i] = sopa_df['retweeted_username'][i][6]\n",
    "        elif j == 7:\n",
    "            sopa_df['rt_username8'][i] = sopa_df['retweeted_username'][i][7]\n",
    "        else:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "...unlist the usernames of the authors of the <ins>*quoted*</ins> tweet\n",
    "\n",
    "1. Determine the highest number of users quoted in one tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(sopa_df)):\n",
    "    if len(sopa_df['quoted_username'][i]) > 1:\n",
    "        print(len(sopa_df['quoted_username'][i]), i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Extract the usernames and put in single columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['qt_username1'] = 'NaN'\n",
    "sopa_df['qt_username1'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(sopa_df)):\n",
    "    try:\n",
    "        sopa_df['qt_username1'][i] = sopa_df['quoted_username'][i][0]\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EU Copyright\n",
    "Import data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = pd.read_csv(\"EU_Tweets_2013-2019.csv\", converters={\"retweeted_username\": literal_eval, \"quoted_username\": literal_eval, \"urls\": literal_eval})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Usernames... \n",
    "...extract usernames of those to which the tweet is a <ins>*reply*</ins>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_tweets = []\n",
    "\n",
    "for i in range(len(EU_df['text'])):\n",
    "    list_tweets.append(EU_df['text'][i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['repliedto_username'] = 'NaN'\n",
    "EU_df['repliedto_username'].replace('NaN', np.NaN)\n",
    "\n",
    "output = []\n",
    "\n",
    "def extract_usernames(tweet):\n",
    "    output.append(re.findall(r'^@(.*?) +[^@]', tweet))\n",
    "    \n",
    "for tweet in list_tweets:\n",
    "    extract_usernames(tweet)\n",
    "\n",
    "EU_df['repliedto_username'] = output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_usernames = []\n",
    "\n",
    "for i in range(len(EU_df['repliedto_username'])):\n",
    "    list_usernames.append(EU_df['repliedto_username'][i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['username_reply'] = 'NaN'\n",
    "EU_df['username_reply'].replace('NaN',np.NaN)\n",
    "\n",
    "output = []\n",
    "\n",
    "def split_usernames(username):\n",
    "    try:\n",
    "        if '@' in username[0]:\n",
    "            output.append(username[0].split('@'))\n",
    "        else:\n",
    "            output.append(username)\n",
    "    except:\n",
    "        output.append([])\n",
    "    \n",
    "for username in list_usernames:\n",
    "    split_usernames(username)\n",
    "\n",
    "EU_df['username_reply'] = output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(EU_df)):\n",
    "    if len(EU_df['username_reply'][i]) > 49:\n",
    "        print(len(EU_df['username_reply'][i]), i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['rp_username1'] = 'NaN'\n",
    "EU_df['rp_username1'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username2'] = 'NaN'\n",
    "EU_df['rp_username2'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username3'] = 'NaN'\n",
    "EU_df['rp_username3'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username4'] = 'NaN'\n",
    "EU_df['rp_username4'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username5'] = 'NaN'\n",
    "EU_df['rp_username5'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username6'] = 'NaN'\n",
    "EU_df['rp_username6'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username7'] = 'NaN'\n",
    "EU_df['rp_username7'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username8'] = 'NaN'\n",
    "EU_df['rp_username8'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username9'] = 'NaN'\n",
    "EU_df['rp_username9'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username10'] = 'NaN'\n",
    "EU_df['rp_username10'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username11'] = 'NaN'\n",
    "EU_df['rp_username11'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username12'] = 'NaN'\n",
    "EU_df['rp_username12'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username13'] = 'NaN'\n",
    "EU_df['rp_username13'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username14'] = 'NaN'\n",
    "EU_df['rp_username14'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username15'] = 'NaN'\n",
    "EU_df['rp_username15'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username16'] = 'NaN'\n",
    "EU_df['rp_username16'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username17'] = 'NaN'\n",
    "EU_df['rp_username17'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username18'] = 'NaN'\n",
    "EU_df['rp_username18'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username19'] = 'NaN'\n",
    "EU_df['rp_username19'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username20'] = 'NaN'\n",
    "EU_df['rp_username20'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username21'] = 'NaN'\n",
    "EU_df['rp_username21'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username22'] = 'NaN'\n",
    "EU_df['rp_username22'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username23'] = 'NaN'\n",
    "EU_df['rp_username23'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username24'] = 'NaN'\n",
    "EU_df['rp_username24'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username25'] = 'NaN'\n",
    "EU_df['rp_username25'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username26'] = 'NaN'\n",
    "EU_df['rp_username26'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username27'] = 'NaN'\n",
    "EU_df['rp_username27'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username28'] = 'NaN'\n",
    "EU_df['rp_username28'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username29'] = 'NaN'\n",
    "EU_df['rp_username29'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username30'] = 'NaN'\n",
    "EU_df['rp_username30'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username31'] = 'NaN'\n",
    "EU_df['rp_username31'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username32'] = 'NaN'\n",
    "EU_df['rp_username32'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username33'] = 'NaN'\n",
    "EU_df['rp_username33'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username34'] = 'NaN'\n",
    "EU_df['rp_username34'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username35'] = 'NaN'\n",
    "EU_df['rp_username35'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username36'] = 'NaN'\n",
    "EU_df['rp_username36'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username37'] = 'NaN'\n",
    "EU_df['rp_username37'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username38'] = 'NaN'\n",
    "EU_df['rp_username38'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username39'] = 'NaN'\n",
    "EU_df['rp_username39'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username40'] = 'NaN'\n",
    "EU_df['rp_username40'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username41'] = 'NaN'\n",
    "EU_df['rp_username41'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username42'] = 'NaN'\n",
    "EU_df['rp_username42'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username43'] = 'NaN'\n",
    "EU_df['rp_username43'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username44'] = 'NaN'\n",
    "EU_df['rp_username44'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username45'] = 'NaN'\n",
    "EU_df['rp_username45'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username46'] = 'NaN'\n",
    "EU_df['rp_username46'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username47'] = 'NaN'\n",
    "EU_df['rp_username47'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username48'] = 'NaN'\n",
    "EU_df['rp_username48'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username49'] = 'NaN'\n",
    "EU_df['rp_username49'].replace('NaN',np.NaN)\n",
    "EU_df['rp_username50'] = 'NaN'\n",
    "EU_df['rp_username50'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(EU_df)):\n",
    "    for j in range(len(EU_df['username_reply'][i])):\n",
    "        if j == 0:\n",
    "            EU_df['rp_username1'][i] = EU_df['username_reply'][i][0]\n",
    "        elif j == 1:\n",
    "            EU_df['rp_username2'][i] = EU_df['username_reply'][i][1]\n",
    "        elif j == 2:\n",
    "            EU_df['rp_username3'][i] = EU_df['username_reply'][i][2]\n",
    "        elif j == 3:\n",
    "            EU_df['rp_username4'][i] = EU_df['username_reply'][i][3]\n",
    "        elif j == 4:\n",
    "            EU_df['rp_username5'][i] = EU_df['username_reply'][i][4]\n",
    "        elif j == 5:\n",
    "            EU_df['rp_username6'][i] = EU_df['username_reply'][i][5]\n",
    "        elif j == 6:\n",
    "            EU_df['rp_username7'][i] = EU_df['username_reply'][i][6]\n",
    "        elif j == 7:\n",
    "            EU_df['rp_username8'][i] = EU_df['username_reply'][i][7]\n",
    "        elif j == 8:\n",
    "            EU_df['rp_username9'][i] = EU_df['username_reply'][i][8]\n",
    "        elif j == 9:\n",
    "            EU_df['rp_username10'][i] = EU_df['username_reply'][i][9]\n",
    "        elif j == 10:\n",
    "            EU_df['rp_username11'][i] = EU_df['username_reply'][i][10]\n",
    "        elif j == 11:\n",
    "            EU_df['rp_username12'][i] = EU_df['username_reply'][i][11]\n",
    "        elif j == 12:\n",
    "            EU_df['rp_username13'][i] = EU_df['username_reply'][i][12]\n",
    "        elif j == 13:\n",
    "            EU_df['rp_username14'][i] = EU_df['username_reply'][i][13]\n",
    "        elif j == 14:\n",
    "            EU_df['rp_username15'][i] = EU_df['username_reply'][i][14]\n",
    "        elif j == 15:\n",
    "            EU_df['rp_username16'][i] = EU_df['username_reply'][i][15]\n",
    "        elif j == 16:\n",
    "            EU_df['rp_username17'][i] = EU_df['username_reply'][i][16]\n",
    "        elif j == 17:\n",
    "            EU_df['rp_username18'][i] = EU_df['username_reply'][i][17]\n",
    "        elif j == 18:\n",
    "            EU_df['rp_username19'][i] = EU_df['username_reply'][i][18]\n",
    "        elif j == 19:\n",
    "            EU_df['rp_username20'][i] = EU_df['username_reply'][i][19]\n",
    "        elif j == 20:\n",
    "            EU_df['rp_username21'][i] = EU_df['username_reply'][i][20]\n",
    "        elif j == 21:\n",
    "            EU_df['rp_username22'][i] = EU_df['username_reply'][i][21]\n",
    "        elif j == 22:\n",
    "            EU_df['rp_username23'][i] = EU_df['username_reply'][i][22]\n",
    "        elif j == 23:\n",
    "            EU_df['rp_username24'][i] = EU_df['username_reply'][i][23]\n",
    "        elif j == 24:\n",
    "            EU_df['rp_username25'][i] = EU_df['username_reply'][i][24]\n",
    "        elif j == 25:\n",
    "            EU_df['rp_username26'][i] = EU_df['username_reply'][i][25]\n",
    "        elif j == 26:\n",
    "            EU_df['rp_username27'][i] = EU_df['username_reply'][i][26]\n",
    "        elif j == 27:\n",
    "            EU_df['rp_username28'][i] = EU_df['username_reply'][i][27]\n",
    "        elif j == 28:\n",
    "            EU_df['rp_username29'][i] = EU_df['username_reply'][i][28]\n",
    "        elif j == 29:\n",
    "            EU_df['rp_username30'][i] = EU_df['username_reply'][i][29]\n",
    "        elif j == 30:\n",
    "            EU_df['rp_username31'][i] = EU_df['username_reply'][i][30]\n",
    "        elif j == 31:\n",
    "            EU_df['rp_username32'][i] = EU_df['username_reply'][i][31]\n",
    "        elif j == 32:\n",
    "            EU_df['rp_username33'][i] = EU_df['username_reply'][i][32]\n",
    "        elif j == 33:\n",
    "            EU_df['rp_username34'][i] = EU_df['username_reply'][i][33]\n",
    "        elif j == 34:\n",
    "            EU_df['rp_username35'][i] = EU_df['username_reply'][i][34]\n",
    "        elif j == 35:\n",
    "            EU_df['rp_username36'][i] = EU_df['username_reply'][i][35]\n",
    "        elif j == 36:\n",
    "            EU_df['rp_username37'][i] = EU_df['username_reply'][i][36]\n",
    "        elif j == 37:\n",
    "            EU_df['rp_username38'][i] = EU_df['username_reply'][i][37]\n",
    "        elif j == 38:\n",
    "            EU_df['rp_username39'][i] = EU_df['username_reply'][i][38]\n",
    "        elif j == 39:\n",
    "            EU_df['rp_username40'][i] = EU_df['username_reply'][i][39]\n",
    "        elif j == 40:\n",
    "            EU_df['rp_username41'][i] = EU_df['username_reply'][i][40]\n",
    "        elif j == 41:\n",
    "            EU_df['rp_username42'][i] = EU_df['username_reply'][i][41]\n",
    "        elif j == 42:\n",
    "            EU_df['rp_username43'][i] = EU_df['username_reply'][i][42]\n",
    "        elif j == 43:\n",
    "            EU_df['rp_username44'][i] = EU_df['username_reply'][i][43]\n",
    "        elif j == 44:\n",
    "            EU_df['rp_username45'][i] = EU_df['username_reply'][i][44]\n",
    "        elif j == 45:\n",
    "            EU_df['rp_username46'][i] = EU_df['username_reply'][i][45]\n",
    "        elif j == 46:\n",
    "            EU_df['rp_username47'][i] = EU_df['username_reply'][i][46]\n",
    "        elif j == 47:\n",
    "            EU_df['rp_username48'][i] = EU_df['username_reply'][i][47]\n",
    "        elif j == 48:\n",
    "            EU_df['rp_username49'][i] = EU_df['username_reply'][i][48]\n",
    "        elif j == 49:\n",
    "            EU_df['rp_username50'][i] = EU_df['username_reply'][i][49]    \n",
    "        else:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "...unlist the usernames of the authors of the <ins>*retweeted*</ins> tweet\n",
    "\n",
    "1. Determine the highest number of users retweeted in one tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(EU_df)):\n",
    "    if len(EU_df['retweeted_username'][i]) > 5:\n",
    "        print(len(EU_df['retweeted_username'][i]), i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Extract the usernames and put in single columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['rt_username1'] = 'NaN'\n",
    "EU_df['rt_username1'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username2'] = 'NaN'\n",
    "EU_df['rt_username2'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username3'] = 'NaN'\n",
    "EU_df['rt_username3'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username4'] = 'NaN'\n",
    "EU_df['rt_username4'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username5'] = 'NaN'\n",
    "EU_df['rt_username5'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username6'] = 'NaN'\n",
    "EU_df['rt_username6'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username7'] = 'NaN'\n",
    "EU_df['rt_username7'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username8'] = 'NaN'\n",
    "EU_df['rt_username8'].replace('NaN',np.NaN)\n",
    "EU_df['rt_username9'] = 'NaN'\n",
    "EU_df['rt_username9'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(EU_df)):\n",
    "    for j in range(len(EU_df['retweeted_username'][i])):\n",
    "        if j == 0:\n",
    "            EU_df['rt_username1'][i] = EU_df['retweeted_username'][i][0]\n",
    "        elif j == 1:\n",
    "            EU_df['rt_username2'][i] = EU_df['retweeted_username'][i][1]\n",
    "        elif j == 2:\n",
    "            EU_df['rt_username3'][i] = EU_df['retweeted_username'][i][2]\n",
    "        elif j == 3:\n",
    "            EU_df['rt_username4'][i] = EU_df['retweeted_username'][i][3]\n",
    "        elif j == 4:\n",
    "            EU_df['rt_username5'][i] = EU_df['retweeted_username'][i][4]\n",
    "        elif j == 5:\n",
    "            EU_df['rt_username6'][i] = EU_df['retweeted_username'][i][5]\n",
    "        elif j == 6:\n",
    "            EU_df['rt_username7'][i] = EU_df['retweeted_username'][i][6]\n",
    "        elif j == 7:\n",
    "            EU_df['rt_username8'][i] = EU_df['retweeted_username'][i][7]\n",
    "        elif j == 8:\n",
    "            EU_df['rt_username9'][i] = EU_df['retweeted_username'][i][8]\n",
    "        else:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df.to_csv(\"EU_Tweets_2013-2019_processed1.csv\", index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "...unlist the usernames of the authors of the <ins>*quoted*</ins> tweet\n",
    "\n",
    "1. Determine the highest number of users quoted in one tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(EU_df)):\n",
    "    if len(EU_df['quoted_username'][i]) > 1:\n",
    "        print(len(EU_df['quoted_username'][i]), i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Extract the usernames and put in single columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['qt_username1'] = 'NaN'\n",
    "EU_df['qt_username1'].replace('NaN',np.NaN)\n",
    "\n",
    "for i in range(len(EU_df)):\n",
    "    try:\n",
    "        EU_df['qt_username1'][i] = EU_df['quoted_username'][i][0]\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df.to_csv(\"EU_Tweets_2013-2019_processed2.csv\", index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with Links from Tweets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define two functions:\n",
    "1. `process_urls` extracts the name of the website from a list of list of urls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_urls(lol_url_short):\n",
    "    start_time = time.time()\n",
    "    \n",
    "    #Initialize empty list to store results:\n",
    "    processed_urls = []\n",
    "    \n",
    "    #count = 0\n",
    "    \n",
    "    for url_list in lol_url_short:\n",
    "        url_dict = {}\n",
    "        url_dict[\"initial_url\"] = []\n",
    "        url_dict[\"website\"] = []\n",
    "        \n",
    "        if len(url_list) > 0:\n",
    "            for url in url_list:\n",
    "                url_dict[\"initial_url\"].append(url)\n",
    "                if \"broken link\" not in url:\n",
    "                    if \"/\" in url[8:]: #if there is a / in the link (after the http[s]://) extract everything between  http[s]://[www.] and the first /\n",
    "                        website_pre = re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+?)[\\/\\?]', url)[0][1]\n",
    "                        url_dict[\"website\"].extend(re.findall(r'(.*)\\.', website_pre))\n",
    "                    else: #if not extract everything between http[s]://[www.] and the last dot\n",
    "                        url_dict[\"website\"].append(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+)[\\.]', url)[0][1])\n",
    "                else:\n",
    "                    pass\n",
    "                  \n",
    "        else:\n",
    "            pass\n",
    "        \n",
    "        processed_urls.append(url_dict)\n",
    "        \n",
    "        #count += 1\n",
    "        #print(count)\n",
    "\n",
    "    print(\"___%s seconds___\" % (time.time()-start_time)) \n",
    "        \n",
    "    return processed_urls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Define a function to expand shortened urls: `expand_urls`. While twitter provides the urls of those short urls created by twitter, expanded urls are not available when a user shortened an url. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def expand_urls(dict_url, index, folder):\n",
    "    \n",
    "    dict_url[\"expanded_url\"] = []\n",
    "    for i in range(len(dict_url[\"initial_url\"])):\n",
    "        if dict_url['initial_url'][i] != \"broken link\" and len(dict_url[\"initial_url\"][i]) < 31 and len(dict_url[\"website\"][i]) < 7:\n",
    "            try:\n",
    "                dict_url[\"expanded_url\"].append(requests.head(dict_url[\"initial_url\"][i]).headers[\"location\"])\n",
    "            except requests.ConnectionError:\n",
    "                dict_url[\"expanded_url\"].append(\"broken link\")\n",
    "            except KeyError:\n",
    "                dict_url[\"expanded_url\"].append(dict_url[\"initial_url\"][i])\n",
    "        else:\n",
    "            dict_url[\"expanded_url\"].append(dict_url[\"initial_url\"][i])\n",
    "        \n",
    "        if r\"http\" not in dict_url[\"expanded_url\"][i]:\n",
    "            dict_url[\"expanded_url\"][i] = dict_url[\"initial_url\"][i]\n",
    "        else:\n",
    "            pass\n",
    "        \n",
    "    with open(folder + \"/file{}.txt\".format(index), \"w\") as output:\n",
    "        output.write(str(dict_url[\"expanded_url\"]))\n",
    "        \n",
    "    #return dict_url[\"expanded_url\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### SOPA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load data and seperate into several chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\wmc010\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3071: DtypeWarning: Columns (36,37,38,39,40,41,42,49,50,51,52,53,54,55,56) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    }
   ],
   "source": [
    "sopa_df = pd.read_csv(\"US_SOPA_Tweets_2010-2012_processed_2.csv\", converters={\"retweeted_username\": literal_eval, \"quoted_username\": literal_eval, \"urls\": literal_eval})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls = sopa_df[\"urls\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply the `process_urls` function to extract the website names which are used to determine whether an url is shortened in the `expand_urls` function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "___9.477051496505737 seconds___\n"
     ]
    }
   ],
   "source": [
    "sopa_urls = process_urls(sopa_urls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use multithreading to apply the `expand_urls` function and increase speed. This is the first expansion of the shortened urls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor(max_workers = 200) as executor:\n",
    "    futures = [executor.submit(expand_urls, folder = \"sopa_firstexpansion\", dict_url = sopa_urls[i], index = i) for i in range(len(sopa_urls))]\n",
    "\n",
    "#sopa_urls_1 = [f.result() for f in futures]\n",
    "\n",
    "print(\"___%s seconds___\" % (time.time()-start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = glob(\"sopa_firstexpansion/file*.txt\") #glob returns a list of filenames that we care about\n",
    "filenames.sort(key=lambda f: int(re.sub('\\D', '', f)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls = [] #load data into a list\n",
    "\n",
    "for file in filenames:\n",
    "    with open(file) as f: \n",
    "        thistxt = literal_eval(f.read())\n",
    "        sopa_urls.append(thistxt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls[6624]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Repeat for the second expansion of the urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls = process_urls(sopa_urls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls[6624]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor(max_workers = 200) as executor:\n",
    "    futures = [executor.submit(expand_urls, folder = \"sopa_secondexpansion\", dict_url = sopa_urls[i], index = i) for i in range(len(sopa_urls))]\n",
    "\n",
    "#sopa_urls = [f.result() for f in futures]\n",
    "\n",
    "print(\"___%s seconds___\" % (time.time()-start_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = glob(\"sopa_secondexpansion/file*.txt\") #glob returns a list of filenames that we care about\n",
    "filenames.sort(key=lambda f: int(re.sub('\\D', '', f)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls = [] #load data into a list\n",
    "\n",
    "for file in filenames:\n",
    "    with open(file) as f: \n",
    "        thistxt = literal_eval(f.read())\n",
    "        sopa_urls.append(thistxt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls[6624]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply the process_urls function once more to get the website names of the now expanded urls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls = process_urls(sopa_urls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_urls[6624]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merge expanded and processed urls to the original dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df = pd.merge(sopa_df[\"urls\"][0:10000], pd.DataFrame(sopa_urls), left_index = True, right_index = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df.sample(60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df.to_csv(\"US_SOPA_Tweets_2010-2012_processed_3.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### EU"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply the `process_urls` function to extract the website names which are used to determine whether an url is shortened in the `expand_urls` function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = pd.read_csv(\"EU_Tweets_2013-2019_processed2.csv\", converters={\"retweeted_username\": literal_eval, \"quoted_username\": literal_eval, \"urls\": literal_eval})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_urls = EU_df[\"urls\"]\n",
    "EU_urls = process_urls(EU_urls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use multithreading to apply the `expand_urls` function and increase speed by a factor of ten. This is the first expansion of the shortened urls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    futures = [executor.submit(expand_urls, dict_url) for dict_url in EU_urls]\n",
    "\n",
    "EU_urls = [f.result() for f in futures]\n",
    "\n",
    "print(\"___%s seconds___\" % (time.time()-start_time))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Repeat for the second expansion of the urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_urls = process_urls(EU_urls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    futures = [executor.submit(expand_urls, dict_url) for dict_url in EU_urls]\n",
    "\n",
    "EU_urls = [f.result() for f in futures]\n",
    "\n",
    "print(\"___%s seconds___\" % (time.time()-start_time))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply the process_urls function once more to get the website names of the now expanded urls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_urls = process_urls(EU_urls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merge expanded and processed urls to the original dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = pd.merge(EU_df, pd.DataFrame(EU_urls), left_index = True, right_index = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df.to_csv(\"EU_Tweets_2013-2019_processed_3.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def expand_process_urls(df, filename):\n",
    "    start_time = time.time() #to measure execution time\n",
    "    \n",
    "    #Initialize empty columns\n",
    "    df[\"full_links_in_tweet_pre\"] = None\n",
    "    df[\"website_pre\"] = None\n",
    "    df[\"website_short_pre\"] = None\n",
    "    df[\"website\"] = None\n",
    "    df[\"website_short\"] = None\n",
    "    df[\"full_links_in_tweet\"] = None\n",
    "    \n",
    "    #loop over all tweets\n",
    "    for i in range(len(df[\"urls\"])):\n",
    "        \n",
    "        #initiate empty lists in the columns\n",
    "        df[\"full_links_in_tweet_pre\"][i] = []\n",
    "        df[\"website_pre\"][i] = [] \n",
    "        df[\"website_short_pre\"][i] = []\n",
    "        df[\"full_links_in_tweet\"][i] = []\n",
    "        df[\"website\"][i] = [] \n",
    "        df[\"website_short\"][i] = []\n",
    "\n",
    "        #if there is at least one link in the tweet... \n",
    "        \n",
    "        if len(df[\"urls\"][i]) > 0:\n",
    "                #...loop over all links and...\n",
    "            for j in range(len(df[\"urls\"][i])):\n",
    "                if \"/\" in df[\"urls\"][i][j][8:]: #if there is a / in the link (after the http[s]://) extract everything between  http[s]://[www.] and the first /\n",
    "                    df[\"website_pre\"][i].append(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+?)[\\/\\?]', df[\"urls\"][i][j])[0][1])\n",
    "                    df[\"website_short_pre\"][i].extend(re.findall(r'(.*)\\.', df[\"website_pre\"][i][j]))\n",
    "                else: #if not extract everything between http[s]://[www.] and the last dot\n",
    "                    df[\"website_pre\"][i].append(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+)[\\.]', df[\"urls\"][i][j])[0][1])\n",
    "                    df[\"website_short_pre\"][i].extend(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+)[\\.]', df[\"urls\"][i][j])[0][1])\n",
    "                \n",
    "                #as there frequently are links that are shortened two times and thus returned as a shortened url by the Twitter API, these needs to be expanded manually\n",
    "\n",
    "                #if the website name is shorter than 7 and the full url shorter than 31 it is likely still a shortened url and it is expanded once again\n",
    "                if len(df[\"website_short_pre\"][i][j]) < 7 and len(df[\"urls\"][i][j]) < 31:\n",
    "                    try:\n",
    "                        df[\"full_links_in_tweet\"][i].append(requests.head(df[\"urls\"][i][j]).headers[\"location\"])\n",
    "                        \n",
    "                        if \"/\" in df[\"full_links_in_tweet\"][i][j][7:]: #if there is a / in the link (after the http[s]://) extract everything between  http[s]://[www.] and the first /\n",
    "                            df[\"website\"][i].append(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+?)[\\/\\?]', df[\"full_links_in_tweet\"][i][j])[0][1])\n",
    "                            df[\"website_short\"][i].extend(re.findall(r'(.*)\\.', df[\"website\"][i][j]))\n",
    "                        else: #if not extract everything between http[s]://[www.] and the last dot\n",
    "                            df[\"website\"][i].append(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+)[\\.]', df[\"full_links_in_tweet\"][i][j])[0][1])\n",
    "                            df[\"website_short\"][i].extend(re.findall(r'http[s]?\\:\\/\\/w{0}(w{3})?[.]?([^(]+)[\\.]', df[\"full_links_in_tweet\"][i][j])[0][1])  \n",
    "                    \n",
    "                    except: #if the request doesnt work the link is broken\n",
    "                        df[\"full_links_in_tweet\"][i].append(\"broken link\")\n",
    "                      \n",
    "                else: #if the website name is longer than 6 and/or the full url longer than 30 just take the already expanded url\n",
    "                    df[\"full_links_in_tweet\"][i].append(df[\"urls\"][i][j])\n",
    "                    df[\"website\"][i].append(df[\"website_pre\"][i][j])\n",
    "                    df[\"website_short\"][i].append(df[\"website_short_pre\"][i][j])\n",
    "                    \n",
    "            # if there is no link in the tweet pass            \n",
    "        else:\n",
    "            pass\n",
    "        # To track the progress every 1000th tweet number will be printed\n",
    "\n",
    "        if (i+1)/1000 in [*range(0,800)]:\n",
    "            print(i+1)\n",
    "        else:\n",
    "            pass\n",
    "    \n",
    "    df = df.drop([\"full_links_in_tweet_pre\", \"website_pre\", \"website_short_pre\"], axis = 1)\n",
    "    \n",
    "    print(\"___%s seconds___\" % (time.time()-start_time))\n",
    "    \n",
    "    df.to_csv(filename, mode = \"a\", sep = \",\")\n",
    "    \n",
    "    #return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Salience Figure\n",
    "### SOPA/PIPA\n",
    "Create datetime index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['datetime_index'] = pd.to_datetime(sopa_df['datetime'], format=\"%Y %m %d\")\n",
    "sopa_df = sopa_df.set_index('datetime_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df = sopa_df.loc['2010-06-01':'2012-06-01']\n",
    "sopa_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt \n",
    "import matplotlib.dates as dates\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily = sopa_df.resample('D').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly = sopa_df.resample('W').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import register_matplotlib_converters\n",
    "register_matplotlib_converters() #datetime function is not integrated beyond pandas this library allows matplotlib to process it\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 7.5))\n",
    "ax.plot(daily, color='black')\n",
    "\n",
    "# add a vertical line for American Censorship Day and Internet Blackout\n",
    "#ax.axvline(pd.to_datetime('2011-11-16'), color='crimson', linestyle='--', lw=1.5) # tell where the line is with pandas to datetime function\n",
    "#ax.axvline(pd.to_datetime('2012-01-18'), color='blue', linestyle='--', lw=1.5)\n",
    "#ax.axvline(pd.to_datetime('2011-12-22'), color='green', linestyle='--', lw=1.5)\n",
    "\n",
    "#delete frame lines at the top and to the right\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "\n",
    "#labels and ticks\n",
    "ax.set_xlabel('Daily observations')\n",
    "ax.set_ylabel('Number of Tweets')\n",
    "ax.tick_params(axis ='both', which ='both', length = 2) #length of tick marks\n",
    "# tick labels\n",
    "ax.xaxis.set_major_locator(dates.YearLocator())\n",
    "ax.xaxis.set_major_formatter(dates.DateFormatter('%m\\n%Y'))\n",
    "ax.xaxis.set_minor_locator(dates.MonthLocator())\n",
    "ax.xaxis.set_minor_formatter(dates.DateFormatter('%m'))\n",
    "\n",
    "ax.set_title('Salience of COICA, PIPA, and SOPA on English Speaking Twitter\\n')\n",
    "\n",
    "\n",
    "#annotation\n",
    "ax.annotate('Introduction \\nCOICA',\n",
    "           xytext=[pd.Timestamp('2010-06'), 60000],\n",
    "           xy=[pd.to_datetime('2010-09-20'), daily.loc[pd.to_datetime('2010-09-20')]],\n",
    "            color='green',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'green'})\n",
    "ax.annotate('Hearing \\nOnline Piracy',\n",
    "           xytext=[pd.Timestamp('2010-08'), 130000],\n",
    "           xy=[pd.to_datetime('2011-03-14'), daily.loc[pd.to_datetime('2011-03-14')]],\n",
    "            color='green',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'green'})\n",
    "ax.annotate('Introduction \\nPIPA',\n",
    "           xytext=[pd.Timestamp('2010-10'), 200000],\n",
    "           xy=[pd.to_datetime('2011-05-12'), daily.loc[pd.to_datetime('2011-05-12')]],\n",
    "            color='green',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'green'})\n",
    "ax.annotate('Introduction \\nSOPA',\n",
    "           xytext=[pd.Timestamp('2010-12'), 270000],\n",
    "           xy=[pd.to_datetime('2011-10-26'), daily.loc[pd.to_datetime('2011-10-26')]],\n",
    "            color='green',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'green'})\n",
    "ax.annotate('American \\nCensorship \\nDay',\n",
    "           xytext=[pd.Timestamp('2011-02'), 340000],\n",
    "           xy=[pd.to_datetime('2011-11-16'), daily.loc[pd.to_datetime('2011-11-16')]],\n",
    "            color='black',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'black'})\n",
    "ax.annotate('Hearing \\nSOPA',\n",
    "           xytext=[pd.Timestamp('2011-05'), 410000],\n",
    "           xy=[pd.to_datetime('2011-12-16'), daily.loc[pd.to_datetime('2011-12-16')]],\n",
    "            color='green',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'green'})\n",
    "ax.annotate('GoDaddy \\nBoycott',\n",
    "           xytext=[pd.Timestamp('2011-07'), 480000],\n",
    "           xy=[pd.to_datetime('2011-12-23'), daily.loc[pd.to_datetime('2011-12-23')]],\n",
    "            color='blue',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                       'color': 'blue'})\n",
    "ax.annotate('Internet \\nBlackout',\n",
    "           xytext=[pd.Timestamp('2011-09'), 550000],\n",
    "           xy=[pd.to_datetime('2012-01-18'), daily.loc[pd.to_datetime('2012-01-18')]],\n",
    "            color='black',\n",
    "           arrowprops={'arrowstyle': '-|>',\n",
    "                      'color': 'black'})\n",
    "\n",
    "plt.savefig('fig_twitter.png', format = 'png', dpi = 2000, bbox_inches = 'tight')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EU Copyright"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create datetime index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df['datetime_index'] = pd.to_datetime(EU_df['datetime'], format=\"%Y %m %d\")\n",
    "EU_df = EU_df.set_index('datetime_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_df = EU_df.loc['2016-01-01':'2020-01-01']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EU_en = EU_df[EU_df[\"language\"] == \"en\"]\n",
    "EU_de = EU_df[EU_df[\"language\"] == \"de\"]\n",
    "EU_fr = EU_df[EU_df[\"language\"] == \"fr\"]\n",
    "EU_it = EU_df[EU_df[\"language\"] == \"it\"]\n",
    "EU_nl = EU_df[EU_df[\"language\"] == \"nl\"]\n",
    "EU_pl = EU_df[EU_df[\"language\"] == \"pl\"]\n",
    "EU_es = EU_df[EU_df[\"language\"] == \"es\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt \n",
    "import matplotlib.dates as dates\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily = EU_df.resample('D').size()\n",
    "daily_en = EU_en.resample('D').size()\n",
    "daily_de = EU_de.resample('D').size()\n",
    "daily_fr = EU_fr.resample('D').size()\n",
    "daily_it = EU_it.resample('D').size()\n",
    "daily_nl = EU_nl.resample('D').size()\n",
    "daily_pl = EU_pl.resample('D').size()\n",
    "daily_es = EU_es.resample('D').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly = EU_df.resample('W').size()\n",
    "weekly_en = EU_en.resample('W').size()\n",
    "weekly_de = EU_de.resample('W').size()\n",
    "weekly_fr = EU_fr.resample('W').size()\n",
    "weekly_it = EU_it.resample('W').size()\n",
    "weekly_nl = EU_nl.resample('W').size()\n",
    "weekly_pl = EU_pl.resample('W').size()\n",
    "weekly_es = EU_es.resample('W').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import register_matplotlib_converters\n",
    "register_matplotlib_converters() #datetime function is not integrated beyond pandas this library allows matplotlib to process it\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 7.5))\n",
    "ax.plot(weekly_de, color='black')\n",
    "ax.plot(weekly_en, color='blue')\n",
    "ax.plot(weekly_fr, color='red')\n",
    "ax.plot(weekly_it, color='green')\n",
    "ax.plot(weekly_nl, color='orange')\n",
    "ax.plot(weekly_pl, color='purple')\n",
    "ax.plot(weekly_es, color='yellow')\n",
    "\n",
    "# add a vertical line for American Censorship Day and Internet Blackout\n",
    "#ax.axvline(pd.to_datetime('2011-11-16'), color='crimson', linestyle='--', lw=1.5) # tell where the line is with pandas to datetime function\n",
    "#ax.axvline(pd.to_datetime('2012-01-18'), color='blue', linestyle='--', lw=1.5)\n",
    "#ax.axvline(pd.to_datetime('2011-12-22'), color='green', linestyle='--', lw=1.5)\n",
    "\n",
    "#delete frame lines at the top and to the right\n",
    "ax.spines['right'].set_visible(False)\n",
    "ax.spines['top'].set_visible(False)\n",
    "\n",
    "#labels and ticks\n",
    "ax.set_xlabel('Daily observations')\n",
    "ax.set_ylabel('Number of Tweets')\n",
    "ax.tick_params(axis ='both', which ='both', length = 2) #length of tick marks\n",
    "# tick labels\n",
    "ax.xaxis.set_major_locator(dates.YearLocator())\n",
    "ax.xaxis.set_major_formatter(dates.DateFormatter('%m\\n%Y'))\n",
    "ax.xaxis.set_minor_locator(dates.MonthLocator())\n",
    "ax.xaxis.set_minor_formatter(dates.DateFormatter('%m'))\n",
    "\n",
    "ax.set_title('Salience of EU Copyright Reform on Twitter\\n')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Natural Language Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from string import punctuation\n",
    "from nltk.corpus import stopwords\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Remove Punctuation, convert to lower case, and remove double spaces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['text'] = [\"\".join([l for l in tweet if l not in punctuation]) for tweet in sopa_df['text']]\n",
    "sopa_df['text'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df['text'] = [tweet.lower() for tweet in sopa_df['text']]  # convert to lower case\n",
    "sopa_df['text'] = [\" \".join(tweet.split()) for tweet in sopa_df['text']]   # remove double spaces by splitting the strings into words and joining these words again\n",
    "sopa_df['text'].tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add index for import in R"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df[\"index\"] = range(len(sopa_df))\n",
    "sopa_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sopa_df.to_csv(\"US_SOPA_Tweets_2010-2012_processed_2.csv\", encoding='utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
